In [22]:
import pandas as pd
import numpy as np
pd.set_option('max_rows',10)
%matplotlib inline
path = 'data/result.h5'
pd.__version__
Out[22]:
In [23]:
# inspect
trip = pd.read_csv('data/trip_data_1.csv',
nrows=5,
usecols="medallion,hack_license,pickup_datetime,dropoff_datetime,passenger_count,trip_distance".split(','))
trip
Out[23]:
In [24]:
trip.dtypes
Out[24]:
In [25]:
# object->datetime64ns
pd.to_datetime(trip['dropoff_datetime'],infer_datetime_format=True)
Out[25]:
In [26]:
# object->categorical
trip['medallion'].astype('category')
Out[26]:
In [27]:
# inspect
fare = pd.read_csv('data/trip_fare_1.csv',
nrows=5,
skipinitialspace=True,
usecols="medallion,hack_license,pickup_datetime,total_amount".split(','))
fare
Out[27]:
In [28]:
# merge
pd.merge(trip,fare, on=['medallion','hack_license','pickup_datetime'])
Out[28]:
In [15]:
# to_hdf
trip.to_hdf('trip.hdf','df',mode='w',format='table')
pd.read_hdf('trip.hdf','df')
Out[15]:
In [36]:
# read
# merged
# convert
# to_hdf
def read_and_convert_data():
def parse_dates(df, name):
df[name] = pd.to_datetime(df[name], infer_datetime_format=True)
return df
def parse_cats(df, name):
df[name] = df[name].astype('category')
return df
trip = pd.read_csv('data/trip_data_1.csv',
usecols="medallion,hack_license,pickup_datetime,dropoff_datetime,passenger_count,trip_distance".split(','))
fare = pd.read_csv('data/trip_fare_1.csv',
skipinitialspace=True,
usecols="medallion,hack_license,pickup_datetime,total_amount".split(','))
merged = pd.merge(trip, fare, on=['medallion','hack_license','pickup_datetime'])
merged = parse_dates(merged,'pickup_datetime')
merged = parse_dates(merged,'dropoff_datetime')
merged = parse_cats(merged, 'medallion')
merged = parse_cats(merged, 'hack_license')
merged.to_hdf(path,'df',mode='w',format='table',data_columns=['medallion','hack_license'])
In [37]:
%timeit -n 1 -r 1 read_and_convert_data()
In [30]:
# how much data did we start with
!ls -ltr data/*.csv
In [31]:
# how do we have now
!ls -ltr data/*.h5
In [39]:
df = pd.read_hdf(path,'df')
df.info()
In [40]:
# uniques for the medallions
len(df.medallion.cat.categories)
Out[40]:
In [41]:
# uniques for the hack license
len(df.hack_license.cat.categories)
Out[41]:
In [42]:
# lengh of the ride
df['length'] = df['dropoff_datetime']-df['pickup_datetime']
In [43]:
df['length'].describe()
Out[43]:
In [44]:
# replace invalid values
df.loc[df['length']<0,'length'] = np.nan
df['length'].describe()
Out[44]:
In [45]:
# select based on a subset
df.loc[df['length'].nlargest(5).index]
Out[45]:
In [46]:
# histogramming
df['passenger_count'].value_counts()
Out[46]:
In [47]:
df['passenger_count'].value_counts().hist(xrot=45)
Out[47]:
In [48]:
# we have dates
df['pickup_datetime'].describe()
Out[48]:
In [49]:
df.set_index('pickup_datetime')
Out[49]:
In [50]:
# fare over time
fare_over_time = df.set_index('pickup_datetime')['total_amount'].resample('D',how='mean')
fare_over_time
Out[50]:
In [51]:
fare_over_time.plot()
Out[51]:
In [52]:
# focus on distance
df['trip_distance'].describe()
Out[52]:
In [53]:
length_in_hours = df['length'] / np.timedelta64(1,'h')
length_in_hours
Out[53]:
In [54]:
# new frame via assign
r = df.assign(speed=df['trip_distance']/length_in_hours)
r
Out[54]:
In [55]:
r['speed'].describe()
Out[55]:
In [56]:
# bad values
r.loc[np.isinf(r['speed']),'speed'] = np.nan
r['speed'].describe()
Out[56]:
In [57]:
r['speed'] = r['speed'].clip_upper(100)
r['speed'].describe()
Out[57]:
In [58]:
r['total_amount'].describe()
Out[58]:
In [59]:
r['total_amount'].corr(r['trip_distance'])
Out[59]:
In [60]:
r['total_amount'].corr(r['speed'])
Out[60]:
In [61]:
# quantile the distance
g = r.groupby(pd.qcut(r['trip_distance'],10))
In [64]:
td = g.mean()
td
Out[64]:
In [65]:
td.plot('passenger_count','total_amount',kind='scatter')
Out[65]:
In [66]:
r.groupby(r['hack_license'])['total_amount'].sum().order(ascending=False).describe()
Out[66]:
In [ ]:
# resources